***************************************************************
*         BUILD DATASET FOR PATIENT RISK PREDICTION           *
***************************************************************

log using "${SIDCodePath}/sidsedd_dataforriskpredict.log", replace

// Section switches: a section below runs only when its switch equals 1.
// Build the per-year SID + SEDD extracts
local byyear            1
// Stack the per-year extracts into one file
local appendyears       1
// Build the recent-visit / diagnosis / procedure predictors
local makepredictors    1
// Build the chronic-condition body-system dummies
local chronb            1
// Merge all predictor files into the final dataset
local combinepredictors 1

if `byyear' == 1{
	*-------------------------------------------------- 
	* For each year, create SID + SEDD merged dataset for machine learning algorithm
	*--------------------------------------------------
	// Variables kept from each source file, by coding era.
	// Three eras are distinguished: 2010 through 2015 Q3, 2015 Q4, and 2016.
	// Wildcards (dx*, pr*, ...) are expanded by keep at the point of use.
		local chgs_varlist       chg* key

		local core_varlist_10_15 age ahour aweekend daystoevent dhour died disp_x dispub04 dispuniform dqtr dshospid ///
								 dx* e_ccs* ecode* female hcup_ed hcup_os hispanic_x homeless hospbrth hospst key los los_x mdnum* medincstq nchronic ndx necode ///
								 neomat npr orproc pay1 pay1_x pl_cbsa pl_ur_cat4 pointoforigin_x pointoforiginub04 pr* pstate pstco2 race race_x totchg totchg_x visitlink year ///
								 zip zipinc_qrtl

		local core_varlist_15q4  age ahour aweekend daystoevent dhour died disp_x dispub04 dispuniform dqtr dshospid ///
								 dx* female hcup_ed hcup_os hispanic_x homeless hospbrth hospst key los los_x mdnum* medincstq nchronic ndx ///
								 neomat npr orproc pay1 pay1_x pl_cbsa pl_ur_cat4 pointoforigin_x pointoforiginub04 pr* pstate pstco2 race race_x totchg totchg_x visitlink year ///
								 zip zipinc_qrtl

		local core_varlist_16    age ahour aweekend daystoevent dhour died disp_x dispub04 dispuniform dqtr dshospid ///
								 dx* female hcup_ed hcup_os hispanic_x homeless hospst key los los_x mdnum* medincstq ndx ///
								 npr pay1 pay1_x pl_cbsa pl_ur_cat4 pointoforigin_x pointoforiginub04 pr* pstate pstco2 race race_x totchg totchg_x visitlink year ///
								 zip zipinc_qrtl

		local dx_varlist_10_15   chron* dx* e_mccs* key multinjury pclass* prmccs*

		local dx_varlist_15q4    chron* dx* key multinjury pclass* prmccs*

	// Variables absent from the later files; they are created as empty placeholders
	// so that every per-year extract ends up with the same set of variable names.
	// Copy-form assignment (no equals sign, no quotes) stores each list verbatim.
	// The equals form evaluates a string expression instead, and older Stata
	// releases truncate long string expressions, which would silently cut these lists.
		local core_missing_15q4  e_ccs1 e_ccs2 e_ccs3 ecode1 ecode2 ecode3 necode
		local core_missing_16    e_ccs1 e_ccs2 e_ccs3 ecode1 ecode2 ecode3 necode dxccs1 dxccs2 dxccs3 dxccs4 dxccs5 dxccs6 dxccs7 dxccs8 dxccs9 dxccs10 neomat orproc prccs1 prccs2 prccs3 prccs4 prccs5 hospbrth nchronic
		local dx_missing_15q4    e_mccs1 e_mccs2 e_mccs3
		local dx_missing_16      chron1 chron2 chron3 chron4 chron5 chron6 chron7 chron8 chron9 chron10 dxmccs1 dxmccs2 dxmccs3 dxmccs4 dxmccs5 dxmccs6 dxmccs7 dxmccs8 dxmccs9 dxmccs10 e_mccs1 e_mccs2 e_mccs3 multinjury pclass1 pclass2 pclass3 pclass4 pclass5 prmccs1 prmccs2 prmccs3 prmccs4 prmccs5

		 forval i = 2010/2016{ 
			 	
				// Years 2010-2014: one CHGS, CORE and DX_PR_GRPS file per database and year.
				if `i' < 2015 {

					// Inpatient (SID): start from charges, then add core and the
					// diagnosis/procedure groups, linking on key.
					// Each keep also discards _merge, which lets the next merge run.
						use "${SIDPath}/`i'/fl_sid_`i'_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SIDPath}/`i'/fl_sid_`i'_core.dta"
						rename *, lower
						keep `chgs_varlist' `core_varlist_10_15' edhour
						duplicates drop key, force
						merge 1:1 key using "${SIDPath}/`i'/fl_sid_`i'_dx_pr_grps.dta"
						rename *, lower
						// Test the loop year, not the dataset variable: a programming if on a
						// variable only looks at the first observation, so a first row with a
						// missing year would skip the placeholder and break the keep below.
						// NOTE(review): the placeholder is a string; if multinjury is numeric in
						// later years, the forced appends downstream will blank it - confirm.
						if `i' == 2010 {
							gen multinjury = ""
						}
						keep `chgs_varlist' `core_varlist_10_15' `dx_varlist_10_15' edhour
						gen inpatient = 1
						save "${SIDDataPath}/fl_sid_`i'_all_short.dta", replace

					// Outpatient (SEDD): same sequence; ahour is renamed to edhour at the end
					// so the hour variable carries the same name as in the inpatient extract.
						use "${SEDDPath}/`i'/fl_sedd_`i'_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SEDDPath}/`i'/fl_sedd_`i'_core.dta"
						rename *, lower
						keep `chgs_varlist' `core_varlist_10_15'
						duplicates drop key, force
						merge 1:1 key using "${SEDDPath}/`i'/fl_sedd_`i'_dx_pr_grps.dta"
						rename *, lower
						// Same first-observation pitfall as above: use the loop year.
						if `i' == 2010 {
							gen multinjury = ""
						}
						keep `chgs_varlist' `core_varlist_10_15' `dx_varlist_10_15'
						rename ahour edhour
						gen inpatient = 0
						save "${SIDDataPath}/fl_sedd_`i'_all_short.dta", replace

				}

				// Year 2015: the source files are split into Q1-Q3 and Q4.
				// Each part is built separately and then the two parts are stacked.
				if `i' == 2015 {

					// ---- SID, Q1-Q3 (same variable lists as 2010-2014) ----
						use "${SIDPath}/`i'/fl_sid_`i'q1q3_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SIDPath}/`i'/fl_sid_`i'q1q3_core.dta"
						rename *, lower
						keep `chgs_varlist' `core_varlist_10_15' edhour
						duplicates drop key, force
						merge 1:1 key using "${SIDPath}/`i'/fl_sid_`i'q1q3_dx_pr_grps.dta"
						rename *, lower
						keep `chgs_varlist' `core_varlist_10_15' `dx_varlist_10_15' edhour
						gen inpatient = 1
						save "${SIDDataPath}/fl_sid_`i'q1q3_all_short.dta", replace

					// ---- SEDD, Q1-Q3 ----
						use "${SEDDPath}/`i'/fl_sedd_`i'q1q3_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SEDDPath}/`i'/fl_sedd_`i'q1q3_core.dta"
						rename *, lower
						keep `chgs_varlist' `core_varlist_10_15'
						duplicates drop key, force
						merge 1:1 key using "${SEDDPath}/`i'/fl_sedd_`i'q1q3_dx_pr_grps.dta"
						rename *, lower
						keep `chgs_varlist' `core_varlist_10_15' `dx_varlist_10_15'
						rename ahour edhour
						gen inpatient = 0
						save "${SIDDataPath}/fl_sedd_`i'q1q3_all_short.dta", replace

					// ---- SID, Q4 ----
					// The i10_ prefix is stripped after each merge so the variable names
					// line up with the earlier files.
						use "${SIDPath}/`i'/fl_sid_`i'q4_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SIDPath}/`i'/fl_sid_`i'q4_core.dta"
						rename *, lower
						rename i10_* *
						keep `chgs_varlist' `core_varlist_15q4' edhour
						duplicates drop key, force
						merge 1:1 key using "${SIDPath}/`i'/fl_sid_`i'q4_dx_pr_grps.dta"
						rename *, lower
						rename i10_* *
						keep `chgs_varlist' `core_varlist_15q4' `dx_varlist_15q4' edhour
						// Empty string placeholders for variables the Q4 files do not carry.
						// The names come from macro contents, not from an existing varlist,
						// because these variables do not exist yet.
						foreach newvar in `core_missing_15q4' `dx_missing_15q4' {
							gen `newvar' = ""
						}
						gen inpatient = 1
						save "${SIDDataPath}/fl_sid_`i'q4_all_short.dta", replace

					// ---- SEDD, Q4 ----
						use "${SEDDPath}/`i'/fl_sedd_`i'q4_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SEDDPath}/`i'/fl_sedd_`i'q4_core.dta"
						rename *, lower
						rename i10_* *
						keep `chgs_varlist' `core_varlist_15q4'
						duplicates drop key, force
						merge 1:1 key using "${SEDDPath}/`i'/fl_sedd_`i'q4_dx_pr_grps.dta"
						rename *, lower
						rename i10_* *
						keep `chgs_varlist' `core_varlist_15q4' `dx_varlist_15q4'
						foreach newvar in `core_missing_15q4' `dx_missing_15q4' {
							gen `newvar' = ""
						}
						rename ahour edhour
						gen inpatient = 0
						save "${SIDDataPath}/fl_sedd_`i'q4_all_short.dta", replace

					// ---- Stack Q1-Q3 and Q4 into one file per database ----
					// force lets the append proceed when a variable is numeric in one part
					// and a string placeholder in the other.
						use "${SIDDataPath}/fl_sid_`i'q1q3_all_short.dta", clear
						append using "${SIDDataPath}/fl_sid_`i'q4_all_short.dta", force
						save "${SIDDataPath}/fl_sid_`i'_all_short.dta", replace

						use "${SIDDataPath}/fl_sedd_`i'q1q3_all_short.dta", clear
						append using "${SIDDataPath}/fl_sedd_`i'q4_all_short.dta", force
						save "${SIDDataPath}/fl_sedd_`i'_all_short.dta", replace

				}

				// Year 2016: only the CHGS and CORE files are merged here; every variable
				// that earlier years take from DX_PR_GRPS is created as an empty placeholder.
				if `i' == 2016 {

					// ---- SID ----
						use "${SIDPath}/`i'/fl_sid_`i'_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SIDPath}/`i'/fl_sid_`i'_core.dta"
						rename *, lower
						// Strip the i10_ prefix so names line up with the earlier years.
						rename i10_* *
						keep `chgs_varlist' `core_varlist_16' edhour
						// Empty string placeholders, core list first and then the dx list.
						foreach newvar in `core_missing_16' `dx_missing_16' {
							gen `newvar' = ""
						}
						gen inpatient = 1
						save "${SIDDataPath}/fl_sid_`i'_all_short.dta", replace

					// ---- SEDD ----
						use "${SEDDPath}/`i'/fl_sedd_`i'_chgs.dta", clear
						rename *, lower
						keep `chgs_varlist'
						duplicates drop key, force
						merge 1:1 key using "${SEDDPath}/`i'/fl_sedd_`i'_core.dta"
						rename *, lower
						rename i10_* *
						keep `chgs_varlist' `core_varlist_16'
						foreach newvar in `core_missing_16' `dx_missing_16' {
							gen `newvar' = ""
						}
						// Same hour-variable name as in the inpatient extract.
						rename ahour edhour
						gen inpatient = 0
						save "${SIDDataPath}/fl_sedd_`i'_all_short.dta", replace

				}
		} 
}


*-------------------------------------------------- 
* Append all datasets together
*--------------------------------------------------
// Stack the per-year SEDD and SID extracts (2010-2016) into one file and keep
// only records flagged by hcup_ed.
if `appendyears' == 1 {
	// 2010 seeds the stacked dataset.
	use "${SIDDataPath}/fl_sedd_2010_all_short.dta", clear
	append using "${SIDDataPath}/fl_sid_2010_all_short.dta"

	// force lets the append proceed when a variable is numeric in one year and a
	// string placeholder in another; the mismatching values are not carried over.
	forval i = 2011/2016 {
		append using "${SIDDataPath}/fl_sedd_`i'_all_short.dta", force
		append using "${SIDDataPath}/fl_sid_`i'_all_short.dta", force
	}

	// Stata treats missing as larger than any number, so a bare hcup_ed > 0 would
	// also keep records whose hcup_ed is missing (for example charge records that
	// found no core record in the merges above). Exclude them explicitly.
	keep if hcup_ed > 0 & !missing(hcup_ed)

	save "${SIDDataPath}/fl_sidsedd_allpredictors_1016.dta", replace
}
	
	

	
*--------------------------------------------------
* Create datasets for 2010-2016
*--------------------------------------------------

// Create a dataset which contains:
	* # of visits within X days
	* # of inpatient stays within X days
	* # days in hospital within X days 

// Builds three predictor files for the pay1_x == "A" sample, one row per record:
//   fl_med_1016_recentvisits : visit / inpatient / length-of-stay totals around each record
//   fl_med_1016_recentdxccs  : per-category diagnosis (dxccs) totals before each record
//   fl_med_1016_recentprccs  : per-category procedure (prccs) totals before each record
// All windows are measured in daystoevent within visitlink, and excludeself
// leaves the current record out of its own totals.
// NOTE(review): the look-back windows (-30..0, -365..0) and the look-ahead windows
// (0..30, 0..365) both include other records with the same daystoevent - confirm
// that same-day records are meant to count in both directions.
if `makepredictors' == 1 {

	// ---------------- Visits and days in hospital ----------------
	use "${SIDDataPath}/fl_sidsedd_allpredictors_1016.dta", clear
	keep if hcup_ed > 0

	gen 	trad_medicare = 0
	replace trad_medicare = 1 if pay1_x == "A"

	keep if trad_medicare == 1

	keep key visitlink daystoevent inpatient los

	// Length of stay split by setting; 0 for records of the other setting.
	gen 	los_ED = 0
	replace los_ED = los if inpatient == 0

	gen 	los_ip = 0
	replace los_ip = los if inpatient == 1

	// Flag for inpatient stays longer than two days.
	gen 	los_ip_gt2 = 0
	replace los_ip_gt2 = 1 if inpatient == 1 & los > 2

	// Look-back, 30 days
	rangestat (count) n_visits_30days    = daystoevent  ///
			  (sum)   n_inpatient_30days = inpatient 	///
			  		  n_los_ip_30days  	 = los_ip       ///
			  		  n_los_ED_30days  	 = los_ED, 		///
			  interval(daystoevent -30 0) by(visitlink) excludeself

	// Look-back, 365 days
	rangestat (count) n_visits_365days    	 = daystoevent  ///
			  (sum)   n_inpatient_365days 	 = inpatient 	///
			  		  n_los_ip_365days  	 = los_ip       ///
			  		  n_los_ED_365days  	 = los_ED, 		///
			  interval(daystoevent -365 0) by(visitlink) excludeself

	// Look-ahead, 30 days
	rangestat (count) n_visits_nxt30days      = daystoevent ///
			  (sum)   n_inpatient_nxt30days   = inpatient 	///
			  		  n_ipgt2_nxt30days  	  = los_ip_gt2 	///
			  		  n_los_ip_nxt30days  	  = los_ip      ///
			  		  n_los_ED_nxt30days  	  = los_ED, 	///
			  interval(daystoevent 0 30) by(visitlink) excludeself

	// Look-ahead, 365 days
	rangestat (count) n_visits_nxt365days     = daystoevent ///
		  (sum)   	  n_inpatient_nxt365days  = inpatient 	///
		  		  	  n_ipgt2_nxt365days  	  = los_ip_gt2 	///
		  		  	  n_los_ip_nxt365days  	  = los_ip      ///
		  		  	  n_los_ED_nxt365days  	  = los_ED, 	///
		  interval(daystoevent 0 365) by(visitlink) excludeself

	save "${SIDDataPath}/riskpredict/fl_med_1016_recentvisits.dta", replace

	// ---------------- Diagnosis categories (dxccs) ----------------
	use "${SIDDataPath}/fl_sidsedd_allpredictors_1016.dta", clear
	keep if hcup_ed > 0
	gen 	trad_medicare = 0
	replace trad_medicare = 1 if pay1_x == "A"

	keep if trad_medicare == 1
	keep key visitlink daystoevent dxccs*

	// One dummy per category value 1..261: 1 if any dxccs variable equals that value.
	forval ccs = 1/261 {
		egen d_dxccs_`ccs' = anymatch(dxccs*), values(`ccs')
	}

	save "${SIDDataPath}/riskpredict/fl_med_1016_d_dxccs.dta", replace

	// The data in memory are exactly what was just saved, so the file is not re-read.
	keep key visitlink daystoevent d_dxccs_*

	// Build the newvar = var list for rangestat. The accumulator is reset first so
	// it can never carry over content from an earlier definition.
	local rangestatinput ""
	forval ccs = 1/261 {
		local rangestatinput "`rangestatinput' d_dxccs_`ccs'_30days = d_dxccs_`ccs'"
	}
	rangestat (sum) `rangestatinput', interval(daystoevent -30 0) by(visitlink) excludeself

	local rangestatinput ""
	forval ccs = 1/261 {
		local rangestatinput "`rangestatinput' d_dxccs_`ccs'_365days = d_dxccs_`ccs'"
	}
	rangestat (sum) `rangestatinput', interval(daystoevent -365 0) by(visitlink) excludeself
	save "${SIDDataPath}/riskpredict/fl_med_1016_recentdxccs.dta", replace

	// ---------------- Procedure categories (prccs) ----------------
	use "${SIDDataPath}/fl_sidsedd_allpredictors_1016.dta", clear
	keep if hcup_ed > 0
	gen 	trad_medicare = 0
	replace trad_medicare = 1 if pay1_x == "A"
	keep if trad_medicare == 1
	keep key visitlink daystoevent prccs*

	// One dummy per category value 1..231: 1 if any prccs variable equals that value.
	forval ccs = 1/231 {
		egen d_prccs_`ccs' = anymatch(prccs*), values(`ccs')
	}
	save "${SIDDataPath}/riskpredict/fl_med_1016_d_prccs.dta", replace

	// As above, the saved file is identical to the data in memory.
	keep key visitlink daystoevent d_prccs_*

	local rangestatinput ""
	forval ccs = 1/231 {
		local rangestatinput "`rangestatinput' d_prccs_`ccs'_30days = d_prccs_`ccs'"
	}
	rangestat (sum) `rangestatinput', interval(daystoevent -30 0) by(visitlink) excludeself

	local rangestatinput ""
	forval ccs = 1/231 {
		local rangestatinput "`rangestatinput' d_prccs_`ccs'_365days = d_prccs_`ccs'"
	}
	rangestat (sum) `rangestatinput', interval(daystoevent -365 0) by(visitlink) excludeself

	save "${SIDDataPath}/riskpredict/fl_med_1016_recentprccs.dta", replace
} // makepredictors
	
	

*-------------------------------------------------- 
* Create dataset with dummies for chronic conditions
*--------------------------------------------------
// Builds one row per record (key, year) with 18 dummies d_chron_bs1..d_chron_bs18:
// d_chron_bsK is 1 when any diagnosis slot has chron == 1 and body system == K.
// Source: the DX_PR_GRPS files of both databases, 2010 through 2015.
// year and inpatient are filled in after each append for the rows just added,
// which are the only rows where they are still missing.
if `chronb' == 1 {
	di "SID 2010"
	use "${SIDPath}/2010/fl_sid_2010_dx_pr_grps.dta", clear
	// chron* covers both the chronic flags (chron1, ...) and the body-system codes
	// (chronb1, ...). Keeping only chronb* left the chronic flags missing for these
	// records, so their dummies could never be 1.
	keep key chron*
	gen year = 2010
	gen inpatient = 1

	// The 2010 SEDD and both 2011 files are lower-cased before appending, so they
	// go through a temporary file rather than a direct append.
	di "SEDD 2010"
	preserve
		use "${SEDDPath}/2010/fl_sedd_2010_dx_pr_grps.dta", clear
		rename *,lower
		// Wildcard, as for the later years; a bare chron is only an abbreviation
		// and does not select the whole chron/chronb family.
		keep key chron*
		tempfile temp
		save `temp', replace
	restore
	append using `temp'

	replace year = 2010 if missing(year)
	replace inpatient = 0 if missing(inpatient)

	di "SID 2011"
	preserve
		use "${SIDPath}/2011/fl_sid_2011_dx_pr_grps.dta", clear
		rename *, lower
		keep key chron*
		tempfile temp
		save `temp', replace
	restore
	append using `temp'
	replace year = 2011 if missing(year)
	replace inpatient = 1 if missing(inpatient)

	di "SEDD 2011"
	preserve
		use "${SEDDPath}/2011/fl_sedd_2011_dx_pr_grps.dta", clear
		rename *, lower
		keep key chron*
		tempfile temp
		save `temp', replace
	restore
	append using `temp'
	replace year = 2011 if missing(year)
	replace inpatient = 0 if missing(inpatient)

	di "SID 2012"
	append using "${SIDPath}/2012/fl_sid_2012_dx_pr_grps.dta", keep(key chron*)
	replace year = 2012 if missing(year)
	replace inpatient = 1 if missing(inpatient)

	di "SEDD 2012"
	append 	using "${SEDDPath}/2012/fl_sedd_2012_dx_pr_grps.dta", keep(key chron*)
	replace year = 2012 if missing(year)
	replace inpatient = 0 if missing(inpatient)

	di "SID 2013"
	append using "${SIDPath}/2013/fl_sid_2013_dx_pr_grps.dta", keep(key chron*)
	replace year = 2013 if missing(year)
	replace inpatient = 1 if missing(inpatient)

	di "SEDD 2013"
	append 	using "${SEDDPath}/2013/fl_sedd_2013_dx_pr_grps.dta", keep(key chron*)
	replace year = 2013 if missing(year)
	replace inpatient = 0 if missing(inpatient)

	// From 2014 on the body-system codes are read from bodysystem* variables.
	di "SID 2014"
	append using "${SIDPath}/2014/fl_sid_2014_dx_pr_grps.dta", keep(key chron* bodysystem*)
	replace year = 2014 if missing(year)
	replace inpatient = 1 if missing(inpatient)

	di "SEDD 2014"
	append 	using "${SEDDPath}/2014/fl_sedd_2014_dx_pr_grps.dta", keep(key chron* bodysystem*)
	replace year = 2014 if missing(year)
	replace inpatient = 0 if missing(inpatient)


	// 2015 comes in four files (two databases, Q1-Q3 and Q4) with differing
	// capitalisation and an i10_ prefix in Q4. The loop runs once, for 2015 only.
	forval i = 2015/2015 {
		*di "`i'"
		*if `i' == 2015{
			append using "${SIDPath}/`i'/fl_sid_`i'q1q3_dx_pr_grps.dta", keep(key bodysystem* chron*)
			replace year = `i' if missing(year)
			replace inpatient = 1 if missing(inpatient)
			append using "${SEDDPath}/`i'/fl_sedd_`i'q1q3_dx_pr_grps.dta", keep(key BODYSYSTEM* CHRON*)
			replace year = `i' if missing(year)
			replace inpatient = 0 if missing(inpatient)
			append using "${SIDPath}/`i'/fl_sid_`i'q4_dx_pr_grps.dta", keep(key i10_bodysystem* i10_chron*)
			replace year = `i' if missing(year)
			replace inpatient = 1 if missing(inpatient)
			append using "${SEDDPath}/`i'/fl_sedd_`i'q4_dx_pr_grps.dta", keep(key I10_BODYSYSTEM* I10_CHRON*)
			replace year = `i' if missing(year)
			replace inpatient = 0 if missing(inpatient)
	}

	// Rename chronb to "bodysystem" to keep it similar across years and harmonize upper and lower case across the years
	// Each replace copies a source variant into the common bodysystem / chron
	// variable for the rows where that variant is filled in.
	forval diag = 1/31{

		// Slots 1-10 also pull from the upper-case variants.
		if `diag' <= 10{
			replace bodysystem`diag' = chronb`diag' 		if !missing(chronb`diag')
			replace bodysystem`diag' = I10_BODYSYSTEM`diag' if !missing(I10_BODYSYSTEM`diag')
			replace chron`diag'	 	 = I10_CHRON`diag' 		if !missing(I10_CHRON`diag')

			replace bodysystem`diag' = BODYSYSTEM`diag' if !missing(BODYSYSTEM`diag')
			replace chron`diag'	 	 = CHRON`diag' 		if !missing(CHRON`diag')

			replace bodysystem`diag' = i10_bodysystem`diag' if !missing(i10_bodysystem`diag')
			replace chron`diag'	 	 = i10_chron`diag' 		if !missing(i10_chron`diag')
		}
		// Slots 11-31 only pull from the lower-case variants.
		if `diag' > 10{
			replace bodysystem`diag' = chronb`diag' 		if !missing(chronb`diag')

			replace bodysystem`diag' = i10_bodysystem`diag' if !missing(i10_bodysystem`diag')
			replace chron`diag'	 	 = i10_chron`diag' 		if !missing(i10_chron`diag')

		}
	}

	drop chronb* I10_BODYSYSTEM* I10_CHRON* BODYSYSTEM* CHRON*

	// Generate a variable which is equal to the bodysystem # only if the diagnosis is chronic
	forval diag = 1/31{
		gen chron_bs_diag`diag' = bodysystem`diag' if chron`diag' == 1
	}

	// Generate a variable which is equal to 1 if the patient has a chronic condition with a given bodysystem
	forval bodysystem = 1/18{
		egen d_chron_bs`bodysystem' = anymatch(chron_bs_diag*), values(`bodysystem')
	}

	keep key year d_chron_bs*
	save "${SIDDataPath}/fl_sidsedd_1016_chronb.dta", replace
} // chronb


// Final assembly: attach every predictor file to the pay1_x == "A" sample, by key.
if `combinepredictors' == 1 {
	use "${SIDDataPath}/fl_sidsedd_allpredictors_1016.dta", clear
	keep if hcup_ed > 0

	// 0/1 flag for pay1_x == "A"; only flagged records are kept.
	gen trad_medicare = (pay1_x == "A")
	keep if trad_medicare == 1

	// Each merge records its match status in its own m_* variable.
	merge 1:1 key using "${SIDDataPath}/riskpredict/fl_med_1016_recentvisits.dta", ///
		keepusing(*30days *365days los_*) generate(m_visits)
	merge 1:1 key using "${SIDDataPath}/riskpredict/fl_med_1016_recentdxccs.dta", ///
		keepusing(d_dxccs*) generate(m_dxccs)
	merge 1:1 key using "${SIDDataPath}/riskpredict/fl_med_1016_recentprccs.dta", ///
		keepusing(d_prccs*) generate(m_prccs)

	// The chronic-condition file is built on the NBER HCUP chronic conditions data
	// from 2020. It has since been updated in 2021, and some of the chronic
	// conditions have changed.
	// keep(3) retains matched records only. The d_chron_bs* dummies arrive ready-made
	// in this file, so they are not regenerated here.
	// NOTE(review): the chronb section appends files for 2010-2015 only, so keep(3)
	// presumably drops every 2016 record - confirm that this is intended.
	merge 1:1 key using "${SIDDataPath}/fl_sidsedd_1016_chronb.dta", ///
		keep(3) generate(m_chronb)

	save "${SIDDataPath}/riskpredict/fl_med_1016_allrecent.dta", replace
} // combine_predictors


log close